# To use Spark 1.6.3 in Jupyter Notebook we have to use Python 3.4
# '!' is the notebook shell escape; prints the interpreter version for sanity
!python --version
# USER to determine spark and dataset directories (per-machine paths).
# FIX: the notebook export stripped the branch indentation, and the original
# else-branch defined only SPARK_DIR, so any other user hit a NameError at the
# first use of DATASET_DIR/METADATA_DIR; all three are now always defined.
import getpass
USER = getpass.getuser()
if USER == 'Tarmo':
    SPARK_DIR = 'C:/Users/Tarmo/Documents/Lausanne/CS-401_applied_data_analysis/spark/spark-1.6.3-bin-hadoop2.6'
    DATASET_DIR = 'C:/Users/Tarmo/Documents/Lausanne/data/Books_5.json'
    METADATA_DIR = 'C:/Users/Tarmo/Documents/Lausanne/data/meta_Books.json'
elif USER == 'adam':
    SPARK_DIR = '/home/adam/EPFL_courses/spark-1.6.3-bin-hadoop2.6'
    DATASET_DIR = '/media/adam/B236CB1D36CAE209/Studia/ADA/reviews_Books_5.json'
    METADATA_DIR = '/media/adam/B236CB1D36CAE209/Studia/ADA/meta_Books.json'
else:
    # Fallback paths (same as 'adam') so later cells still find the variables
    SPARK_DIR = '/home/adam/EPFL_courses/spark-1.6.3-bin-hadoop2.6'
    DATASET_DIR = '/media/adam/B236CB1D36CAE209/Studia/ADA/reviews_Books_5.json'
    METADATA_DIR = '/media/adam/B236CB1D36CAE209/Studia/ADA/meta_Books.json'
SPARK_DIR
import findspark
findspark.init(SPARK_DIR)
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SQLContext
from pyspark.sql.functions import to_date, unix_timestamp, from_unixtime #to_timestamp, pyspark 2.2 functiona
import pandas as pd
from scipy import stats
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.utils import resample
from scipy import stats
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import json
import re
import json
import gzip
from collections import Counter
import operator
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
Loading and processing the data to see what it contains. To make further processing faster, we create intermediate well-structured csv files to avoid having to process the JSON data files every time we want to look at the data. We take a look at several different metrics and their aggregates in the data (review length, review number; by day, by month, by year, by book, etc.)
# Create the Spark context and the SQL entry point (Spark 1.6 style API)
sc = SparkContext()
sqlContext = SQLContext(sc)
sc.version
# Load the dataset and create RDDs
text_file = sc.textFile(DATASET_DIR)
# Convert previously read text file to json DataFrame
df = sqlContext.read.json(text_file)
# Load metadata for dataset and convert it to DataFrame
metadata = sc.textFile(METADATA_DIR)
metadata_df = sqlContext.read.json(metadata)
# Register DataFrames as tables to use those names in SQL-type queries
sqlContext.registerDataFrameAsTable(metadata_df, "metadata")
sqlContext.registerDataFrameAsTable(df, "dataset")
# Number of RDDs
text_file.getNumPartitions()
# Number of records in the dataset
text_file.count()
# Schema of the dataset
df.printSchema()
# Bunch of castings, reviewTime: string->date, unixReviewTime: int->timestamp
# We do this to be able to better filter and manipulate the data
# (from_unixtime first turns the epoch seconds into a time string)
df = df.withColumn('unixReviewTime', from_unixtime(df['unixReviewTime']))
df = df.withColumn('reviewTime', to_date(df['unixReviewTime']))
df = df.withColumn('unixReviewTime', df['unixReviewTime'].cast('timestamp'))
df.printSchema()
# Look at couple of records, just to be sure that we obtained what we wanted
df.select("reviewTime", 'reviewText', 'unixReviewTime').take(2)
# Computing an average length of review per day
# aggregateByKey accumulator per day: (sum of review lengths, review count)
aTuple = (0, 0)
avg = df.select("reviewTime", 'reviewText').rdd.map(lambda row: (row.reviewTime, len(row.reviewText)))
avg = avg.aggregateByKey(aTuple, lambda a,b: (a[0] + b, a[1] + 1), lambda a,b: (a[0] + b[0], a[1] + b[1]))
# (sum, count) -> mean review length for that day
avg = avg.mapValues(lambda v: v[0]/v[1])
avg = avg.collect()
# Processing acquired data using Pandas
avg_len = pd.DataFrame(avg, columns=['Date', 'Avg_length'])
avg_len['Date'] = pd.to_datetime(avg_len['Date'])
avg_len.set_index('Date', inplace=True)
avg_len.sort_index(inplace=True)
avg_len.head()
# Save to file not to compute this one more time
avg_len.to_csv("avg_length_review_by_day.csv")
avg_len = pd.read_csv("avg_length_review_by_day.csv", index_col='Date', parse_dates=['Date'])
avg_len.head()
# Monthly series is the unweighted mean of the daily means (not weighted by
# how many reviews each day had)
monthly_data = avg_len.groupby(avg_len.index.to_period('M')).mean()
monthly_data.plot(figsize=(20,10))
We notice that the average review length per month appears to have three somewhat stable levels in time: before 2000, between 2000 and 2013 and after 2013. In the middle period the review length is consistently high, and crashes around 2012-2013. In the beginning of the dataset time period, the number of reviews is small so there is high variation in monthly average.
# Zoom into the 2012-2013 window where the drop in average length occurs
monthly_data['2012':'2013'].plot(figsize=(20,10))
The extreme drop is located between August 2012 and January 2013.
# Save processed data
monthly_data.to_csv("avg_length_review_by_month.csv")
# (Successful) attempt to filter by timestamp
# Keep reviews between May 2012 and Jan 2013 -- the window of the length drop
tmp = df.rdd.filter(lambda row: row.unixReviewTime > pd.to_datetime('2012-05')
and row.unixReviewTime < pd.to_datetime('2013'))
tmp.take(5)
# Number of reviews per each day (classic word-count pattern keyed by date)
number_of_reviews = df.rdd.map(lambda row: (row.reviewTime, 1)).reduceByKey(lambda a, b: a+b).collect()
# Processing the data
rev_num = pd.DataFrame(number_of_reviews, columns=['Date', 'Number of reviews'])
rev_num['Date'] = pd.to_datetime(rev_num['Date'])
rev_num.set_index('Date', inplace=True)
rev_num.sort_index(inplace=True)
rev_num.head()
# Save the data not to compute over and over
rev_num.to_csv("number_of_reviews_per_day.csv")
# Read file with the data
rev_num = pd.read_csv("number_of_reviews_per_day.csv", index_col='Date', parse_dates=['Date'])
rev_num.tail()
# Aggregate daily counts into monthly totals
monthly_data_reviews = rev_num.groupby(rev_num.index.to_period('M')).sum()
# Save the processed data
monthly_data_reviews.to_csv("number_of_reviews_per_month.csv")
# Read file with the data
monthly_data_reviews = pd.read_csv("number_of_reviews_per_month.csv", index_col='Date', parse_dates=['Date'])
monthly_data_reviews.plot(figsize=(20,10))
We see that in the beginning of the data period, there are hardly any reviews being written. The number of reviews increases consistently until there is a huge jump after 2012. The extreme increase coincides with the drop in the average review length, and may reflect a sudden change in reviewer numbers and/or behavior. The apparent increase might also be due to changes in Amazon review policies: after a significant change, Amazon might retroactively remove old reviews not conforming to the new policy.
# Count reviews per book; 'asin' is the Amazon product identifier
number_of_reviews_per_book = df.rdd.map(lambda row: (row.asin, 1)).reduceByKey(lambda a, b: a+b).collect()
len(number_of_reviews_per_book)
# Processing the data using Pandas and saving it to csv file
df_number_of_reviews_per_book = pd.DataFrame(number_of_reviews_per_book, columns=['Book_id', 'Number of reviews'])
df_number_of_reviews_per_book.sort_values('Number of reviews', ascending=False, inplace = True)
df_number_of_reviews_per_book.to_csv("number_of_reviews_per_book.csv", index=False)
df_number_of_reviews_per_book.head(10)
# Read file with the data
df_number_of_reviews_per_book = pd.read_csv("number_of_reviews_per_book.csv", index_col='Book_id')
# Boxplot of numbers of reviews
sns.boxplot(np.array(df_number_of_reviews_per_book['Number of reviews'].values), orient='v')
The boxplot is not very box-like, since most books have only a small number of reviews.
We look at the numbers without the outliers:
# Same boxplot with outliers hidden so the quartiles become readable
sns.boxplot(np.array(df_number_of_reviews_per_book['Number of reviews'].values), orient='v', showfliers=False)
# Basic statistics for this variable
stats.describe(df_number_of_reviews_per_book['Number of reviews'].values)
Our time-independent popularity metric is the number of reviews, so we take a closer look at the top 30 books according to it.
# Top 30 books by total review count (our time-independent popularity metric)
top_books = df.rdd.map(lambda row: (row.asin, 1)).reduceByKey(lambda a, b: a+b).sortBy(lambda wc: -wc[1]).take(30)
top_books_df = sqlContext.createDataFrame(top_books, ['asin', 'rew_num'])
sqlContext.registerDataFrameAsTable(top_books_df, "top_books")
top_books_df.take(5)
# Join the dataset table with the metadata table to see titles of the most popular books
sqlContext.sql("select t.asin, m.title from metadata m join top_books t on m.asin=t.asin").collect()
# FIX: the original second query had no ON clause, so the join degenerated to a
# cross product and LIMIT returned arbitrary metadata/top_books combinations
sqlContext.sql("select t.asin, m.title from metadata m join top_books t on m.asin=t.asin limit 10").collect()
most_reviewed_books_id_top = df_number_of_reviews_per_book[:30]
# FIX: the original chained call was split across lines with no parentheses
# (a SyntaxError outside a notebook cell). Also build the id collection as a
# set once, for O(1) membership tests inside the filter.
top30_ids = set(most_reviewed_books_id_top.Book_id)
most_reviewed_top = (df.rdd
                     .filter(lambda row: row.asin in top30_ids)
                     .map(lambda row: (row.asin, row.reviewTime))
                     .collect())
# Same selection expressed as a SQL query
most_reviewed_top_2 = sqlContext.sql("select asin, reviewTime from dataset where asin in " +
                                     str(tuple(most_reviewed_books_id_top.Book_id)))
# Processing the data using Pandas
most_reviewed_books_top30_df = pd.DataFrame(most_reviewed_top, columns=['asin', 'reviewTime'])
# Convert to datetime type
most_reviewed_books_top30_df['reviewTime'] = pd.to_datetime(most_reviewed_books_top30_df['reviewTime'])
# Assign 1 per review so a groupby-sum yields review counts
most_reviewed_books_top30_df['Number_of_reviews'] = 1
# Create monthly period for aggregation purpose
most_reviewed_books_top30_df['Year-month'] = most_reviewed_books_top30_df['reviewTime'].dt.to_period('M')
most_reviewed_books_top30_df.head()
# Save data describing number of reviews per day for each book in top30
# (FIX: parenthesized -- the original bare continuation line was a SyntaxError)
(most_reviewed_books_top30_df.groupby(['asin', 'reviewTime']).sum()
 .to_csv("number_of_reviews_per_day_top30_books.csv"))
# Aggregating the data by month
m_rev_books_by_month = most_reviewed_books_top30_df.groupby(['asin', 'Year-month'], as_index=True).sum()
m_rev_books_by_month.to_csv("number_of_reviews_per_month_top30_books.csv")
m_rev_books_by_month = pd.read_csv("number_of_reviews_per_month_top30_books.csv").set_index(['asin', 'Year-month'])
# Pivot so each top-30 book becomes one column of monthly review counts
number_of_reviews_per_month_top30_books_UNSTACKED_df = m_rev_books_by_month.unstack(level=0)
number_of_reviews_per_month_top30_books_UNSTACKED_df.to_csv("number_of_reviews_per_month_top30_books_UNSTACKED.csv")
number_of_reviews_per_month_top30_books_UNSTACKED_df.plot(figsize = (20,10))
We see that for the top 30 books, most reviews are made after January 2013.
# Restrict to the post-2013 period where most of the reviews lie
number_of_reviews_per_month_top30_books_UNSTACKED_df['2013':].plot(figsize = (20,10))
A top 30 book accrues a lot of reviews right after publication (or rather, appearance of first review). It is highly book-dependent whether this rate of reviewing drops quickly or slowly.
# Cumulative monthly counts show each top-30 book's total review accrual over time
number_of_reviews_per_month_top30_books_cumsum_df = number_of_reviews_per_month_top30_books_UNSTACKED_df.cumsum()
number_of_reviews_per_month_top30_books_cumsum_df.shape
number_of_reviews_per_month_top30_books_cumsum_df.plot(figsize = (20,10))
# Average review length per book plus the per-book review count.
# Accumulator per asin: (sum of review lengths, number of reviews)
aTuple = (0, 0)
avg_len_review = df.select('asin', 'reviewText').rdd.map(lambda row: (row.asin, len(row.reviewText)))
avg_len_review = avg_len_review.aggregateByKey(aTuple, lambda a,b: (a[0] + b, a[1] + 1), lambda a,b: (a[0] + b[0], a[1] + b[1]))
# Keep both the mean length and the count for each book
avg_len_review = avg_len_review.mapValues(lambda v: (v[0]/v[1], v[1]))
avg_len_review = avg_len_review.collect()
# Transformation of the data to be able to load it as a DataFrame
avg_len_review = [(k, v1, v2) for k, (v1, v2) in avg_len_review]
# Processing and saving to file
avg_len_review_per_book_df = pd.DataFrame(avg_len_review, columns=['Book_id', 'Avg_len', 'number_of_reviews'])
avg_len_review_per_book_df.sort_values(['Avg_len', 'number_of_reviews'], ascending=False, inplace=True)
avg_len_review_per_book_df.to_csv("avg_length_and_number_of_reviews_per_book.csv", index=False)
avg_len_review_per_book_df.head(10)
avg_len_review_per_book_df = pd.read_csv("avg_length_and_number_of_reviews_per_book.csv",index_col="Book_id")
avg_len_review_per_book_df.head()
avg_len_review_per_book_df.plot(x = 'Avg_len', y = 'number_of_reviews', kind = 'scatter', figsize = (20,10),
alpha = 0.3, title = 'Number of reviews to average length for each book')
# Filter out barely-reviewed books to reduce noise in the scatter
avg_len_review_per_book_df[avg_len_review_per_book_df['number_of_reviews'] > 10].plot(x = 'Avg_len', y = 'number_of_reviews', kind = 'scatter', figsize = (20,10),
alpha = 0.3, title = 'Number of reviews to average length for books with 10 and more reviews')
Book serial number, date of a review, rating, length of review.
# One record per review: book id, timestamp, reviewer, review length in
# characters, star rating, and the raw 'helpful' votes field
ratings_with_length = df.select("asin", "unixReviewTime", "reviewerID", 'reviewText', 'overall', 'helpful').rdd.map(lambda row: (row.asin, row.unixReviewTime, row.reviewerID, len(row.reviewText), row.overall, row.helpful)).collect()
ratings_with_length_df = pd.DataFrame(ratings_with_length, columns = ["asin", "unixReviewTime", "reviewerID", 'review_length', 'overall', 'helpful'])
ratings_with_length_df['unixReviewTime'] = pd.to_datetime(ratings_with_length_df['unixReviewTime'])
ratings_with_length_df['overall'] = pd.to_numeric(ratings_with_length_df['overall'], downcast='unsigned')
ratings_with_length_df['review_length'] = pd.to_numeric(ratings_with_length_df['review_length'])
ratings_with_length_df.dtypes
# 'helpful' is a two-element list; split it into separate numeric columns.
# NOTE(review): columns are named yes/no, which assumes a [helpful, unhelpful]
# ordering -- confirm against the dump (it is sometimes [helpful, total])
ratings_with_length_df['helpful_yes'] = ratings_with_length_df['helpful'].apply(lambda x: x[0])
ratings_with_length_df['helpful_yes'] = pd.to_numeric(ratings_with_length_df['helpful_yes'])
ratings_with_length_df['helpful_no'] = ratings_with_length_df['helpful'].apply(lambda x: x[1])
ratings_with_length_df['helpful_no'] = pd.to_numeric(ratings_with_length_df['helpful_no'])
ratings_with_length_df.drop('helpful', inplace = True, axis = 1)
ratings_with_length_df.to_csv("ratings_with_length.csv", index=False)
ratings_with_length_df.describe()
# Review length distribution per star level (one horizontal box per rating)
fig, ax = plt.subplots()
fig.set_figwidth(20)
sns.boxplot(y = 'overall', x = 'review_length', data = ratings_with_length_df, orient='h',ax = ax)
# Same plot without outliers
fig, ax = plt.subplots()
fig.set_figwidth(20)
sns.boxplot(y = 'overall', x = 'review_length', data = ratings_with_length_df, orient='h', ax = ax, showfliers=False)
Each row contains: book serial number, date of the review, rating (overall). We construct an intermediate file where each book (asin) is associated with five column features, which give the number of ratings from 1 to 5, respectively.
Rating is in stars (integers), from 1 to 5.
# Each row: book id, review date, star rating cast to int (1-5)
ratings = df.select("asin", "reviewTime", "overall").rdd.map(lambda row: (row.asin, row.reviewTime, int(row.overall))).collect()
# Create a DataFrame from the dataset collected before
ratings_df = pd.DataFrame(ratings, columns=['asin', 'Date', 'rating'])
ratings_df['Date'] = pd.to_datetime(ratings_df['Date'])
ratings_df.shape
ratings_df
# Save the data
ratings_df.to_csv("ratings.csv", index=False)
# Read the data from the file
ratings_df = pd.read_csv("ratings.csv", parse_dates=['Date'])
# Number of 1, 2, 3, 4, 5 - star ratings in each month
ratings_df['Year-month'] = ratings_df['Date'].dt.to_period('M')
# count() on the leftover 'asin' column yields the number of ratings per group
ratings_grouped_df = ratings_df[['Year-month', 'rating', 'asin']].groupby(['Year-month', 'rating']).count()
ratings_grouped_unstack_date_df = ratings_grouped_df.unstack(level=1)
ratings_grouped_unstack_date_df.head()
ratings_grouped_unstack_date_df.plot(figsize = (20,10))
# Aggregate data by number of each 1, 2, 3, 4, 5 ratings for each book
ratings_grouped_asin_df = ratings_df[['Year-month', 'rating', 'asin']].groupby(['asin', 'rating']).count()
ratings_grouped_asin_unstack_df = ratings_grouped_asin_df.unstack(level=1, fill_value=0)
# Drop the outer column level so columns are just the rating values 1..5
ratings_grouped_asin_unstack_df.columns = ratings_grouped_asin_unstack_df.columns.droplevel()
ratings_grouped_asin_unstack_df.head()
# Save aggregated data to file
ratings_grouped_asin_unstack_df.to_csv("ratings_grouped_unstack.csv")
# Read data from file
ratings_grouped_unstack_df = pd.read_csv('ratings_grouped_unstack.csv', index_col='asin')
# After the csv round-trip, normalize the column labels to the strings "1".."5"
ratings_grouped_unstack_df.rename(columns={1:"1", 2:"2", 3:"3", 4:"4", 5:"5"}, inplace=True)
ratings_grouped_unstack_df.head()
Column 1 contains the number of 1-star ratings, etc.
# Calculating weighted mean and standard deviation
# FIX: the notebook export stripped the function-body indentation; restored.
def weighted_avg_and_std(values, weights):
    """
    Return the weighted average and standard deviation, row-wise (axis=1).

    values, weights -- NumPy ndarrays with the same shape; here rows are
    books and columns are the five star levels, so weights are the number
    of times each star was given.
    """
    average = np.average(values, weights=weights, axis=1)
    # Weighted mean of squared deviations from each row's weighted mean
    variance = np.average(((values.T - average).T)**2, weights=weights, axis=1)
    return (average, np.sqrt(variance))
# Apply the weighted statistics: values are the star levels 1..5 broadcast to
# one row per book, weights are how many times each star was given
ratings_grouped_unstack_df['Weighted_mean'], ratings_grouped_unstack_df[
'Std_dev'] = weighted_avg_and_std(
values=np.broadcast_to([1, 2, 3, 4, 5],
(ratings_grouped_unstack_df.shape[0], 5)),
weights=np.array(ratings_grouped_unstack_df[['1', '2', '3', '4',
'5']].values))
# Total review count per book = sum of the five star-count columns
ratings_grouped_unstack_df['Number_of_reviews'] = np.sum(
np.array(ratings_grouped_unstack_df[['1', '2', '3', '4', '5']].values),
axis=1)
ratings_grouped_unstack_df.head()
# Attach the average review length per book (both indices are the asin)
ratings_grouped_unstack_df = pd.merge(
ratings_grouped_unstack_df,
avg_len_review_per_book_df,
right_index=True,
left_index=True)
# 'number_of_reviews' duplicates the freshly computed 'Number_of_reviews'
ratings_grouped_unstack_df.drop('number_of_reviews', inplace=True, axis=1)
ratings_grouped_unstack_df.head()
ratings_grouped_unstack_df.describe()
g = sns.pairplot(ratings_grouped_unstack_df)
g.set(xticklabels=[])
ratings_grouped_unstack_df.plot(y = 'Std_dev', kind = 'hist', bins = 40, figsize = (20,7),
title = 'Standard deviation of ratings for books')
ratings_grouped_unstack_df[ratings_grouped_unstack_df['Number_of_reviews'] > 9].plot(y = 'Std_dev',
kind = 'hist', bins = 40, figsize = (20,7),
title = 'Standard deviation of ratings for books with 10 or more reviews')
ratings_grouped_unstack_df.plot(y = 'Std_dev', kind = 'box')
ratings_grouped_unstack_df.plot(x = 'Weighted_mean', y = 'Std_dev', kind = 'scatter', alpha = 0.3, figsize = (20,7))
ratings_grouped_unstack_df[ratings_grouped_unstack_df['Number_of_reviews'] > 9].plot(x = 'Weighted_mean', y = 'Std_dev', kind = 'scatter', alpha = 0.3, figsize = (20,7))
ratings_grouped_unstack_df.plot(x = 'Weighted_mean', y = 'Number_of_reviews', kind = 'scatter', alpha = 0.3, figsize = (15,7))
ratings_grouped_unstack_df.plot(x = 'Weighted_mean', y = 'Avg_len', kind = 'scatter', figsize = (20,7), alpha = 0.3)
ratings_grouped_unstack_df[ratings_grouped_unstack_df['Number_of_reviews'] > 100].plot(x = 'Std_dev', y = 'Avg_len', kind = 'scatter', figsize = (20,7), alpha = 0.3)
# Split books into low / high rating-variance groups using fixed cutoffs
books_low_std = ratings_grouped_unstack_df[ratings_grouped_unstack_df['Std_dev'] < 0.507]
books_high_std = ratings_grouped_unstack_df[ratings_grouped_unstack_df['Std_dev'] > 1.18]
print(books_low_std.shape[0], books_high_std.shape[0])
books_low_std.describe()
books_high_std.describe()
# Compare the review-length distributions of the two groups
fig, ax1 = plt.subplots(figsize=(15, 4))
bp = sns.boxplot(data = [books_low_std['Avg_len'].values, books_high_std['Avg_len'].values], orient = 'h')
ax1.set_yticklabels(['Low std','High std'])
ax1.set_xlabel("Average review length")
# Two-sample Kolmogorov-Smirnov test: do the two groups share a distribution?
stats.ks_2samp(books_low_std['Avg_len'].values, books_high_std['Avg_len'].values)
We can reject the hypothesis that these two samples come from the same distribution, which supports the claim that books with a high standard deviation of ratings have, on average, longer reviews than those with a low standard deviation.
# Same comparison without outliers for a clearer view of the quartiles
fig, ax1 = plt.subplots(figsize=(15, 6))
bp = sns.boxplot(data = [books_low_std['Avg_len'].values, books_high_std['Avg_len'].values], orient = 'h', showfliers=False)
ax1.set_yticklabels(['Low std','High std'])
ax1.set_xlabel("Average review length")
# FIX: the notebook export stripped the function-body indentation; restored.
def draw_plot(data, offset, edge_color, fill_color):
    """Draw one boxplot per column of `data`, shifted horizontally by
    `offset` so two groups can be drawn side by side on the same axes.

    NOTE(review): relies on a module-level `ax` being defined before the
    call; edge_color/fill_color are currently unused because the styling
    code below is commented out.
    """
    pos = np.arange(data.shape[1]) + offset
    bp = ax.boxplot(data, positions=pos, widths=0.3, patch_artist=True, manage_xticks=False)
    # for element in ['boxes', 'whiskers', 'fliers', 'medians', 'caps']:
    #     plt.setp(bp[element], color=edge_color)
    # for patch in bp['boxes']:
    #     patch.set(facecolor=fill_color)
# Side-by-side boxplots of every column except Avg_len for the two groups
fig, ax = plt.subplots(figsize = (20, 10))
# FIX: DataFrame.ix is deprecated (removed in pandas 1.0); the boolean column
# mask selection is expressed with .loc instead
draw_plot(books_low_std.loc[:, books_low_std.columns != "Avg_len"].values, -0.2, "tomato", "white")
draw_plot(books_high_std.loc[:, books_high_std.columns != "Avg_len"].values, +0.2,"skyblue", "white")
ax.set_xticks(np.arange(books_low_std.shape[1] - 1))
ax.set_xticklabels(books_low_std.columns)
ax.set_ylim([-5, 50])
#plt.savefig(__file__+'.png', bbox_inches='tight')
plt.show()
plt.close()
Overall popularity measure (#reviews / #days from first review)
# Overall popularity measure (#reviews / #days from first review)
# Per book: the date of its first review and its total number of ratings
ratings_gb_df = ratings_df[['asin', 'Date', 'rating']].groupby('asin').agg({
'Date':
np.min,
'rating':
'count'
})
ratings_gb_df.columns = ['first_review_date', 'number_of_reviews']
# "Days on the market" is measured against the newest review in the dataset
last_review = ratings_df.Date.max()
ratings_gb_df[
'days_since_first_review'] = last_review - ratings_gb_df['first_review_date']
ratings_gb_df['days_since_first_review'] = ratings_gb_df[
'days_since_first_review'].astype('timedelta64[D]')
# Reviews per day on the market
ratings_gb_df['popularity_factor'] = ratings_gb_df[
'number_of_reviews'] / ratings_gb_df['days_since_first_review']
popularity_measure1_df = ratings_gb_df.sort_values(
'popularity_factor', ascending=False)
popularity_measure1_df.head()
Computing a popularity measure equal to the biggest 3-month-period review increase (the biggest first derivative of the accumulated number of reviews, with a delta equal to 3 months).
# DataFrame having all possible months
first_review = ratings_df.Date.min()
first_review_month = first_review.to_period("M")
last_review = ratings_df.Date.max()
last_review_month = last_review.to_period("M")
# Full monthly range spanning the dataset; used later to fill months with no
# reviews with an explicit 0
month_year_timespan = pd.date_range(first_review_month.to_timestamp(), last_review_month.to_timestamp(), freq = 'M').to_period('M')
month_year_timespan_df = pd.DataFrame(month_year_timespan, columns = ['Year-month'])
# FIX: the notebook export stripped the function-body indentation; restored.
def pop_measure_2(df, period):
    """Popularity measure 2 for a single book: the largest increase of the
    cumulative review count over any `period`-month window.

    df -- reviews of one book with 'asin', 'Year-month' and 'rating' columns
    period -- window length in months
    Returns the ['asin', 'Year-month', 'month_diff'] row of the peak window.

    NOTE(review): the right-join fills months without reviews, but the fill
    also sets 'asin' to 0 on those rows, so a peak falling on a fill row
    yields asin == 0 (handled downstream by dropping those entries).
    """
    df = df.groupby(['asin', "Year-month"]).count()
    df = df.reset_index()
    # Right-join against the full month range so gap months count as 0 reviews
    df = pd.merge(
        df,
        month_year_timespan_df,
        how='right',
        right_on='Year-month',
        left_on="Year-month")
    df.fillna(0, inplace=True)
    df.sort_values("Year-month", inplace=True)
    df['cumsum_rating'] = df['rating'].cumsum()
    df['rating_shift'] = df['cumsum_rating'].shift(period)
    # cumsum minus its shifted self = reviews gained inside the window
    df['month_diff'] = df['cumsum_rating'] - df['rating_shift']
    return df[['asin', 'Year-month',
               'month_diff']].loc[df['month_diff'].idxmax()]
# Computing popularity measure equal to the biggest 3-month-period review increase
# (the biggest first derivative over accumulated number of reviews with delta equal to 3 months)
# FIX: the for-loop body lost its indentation in the notebook export; restored.
period = 3
ratings_gb_df = ratings_df[['asin', 'Year-month', 'rating']].groupby('asin')
popularity_measure2_df = pd.DataFrame()
for idx, book in enumerate(ratings_gb_df):
    # book is an (asin, group-frame) pair; copy so pop_measure_2 may mutate freely
    temp = pd.DataFrame(book[1], copy=True)
    popularity_measure2_df = pd.concat([popularity_measure2_df, pop_measure_2(temp, period)])
# pop_measure_2 returns a Series, so the concat result is one long column in
# which every consecutive triple is (asin, Year-month, month_diff)
asins = popularity_measure2_df.values[::3].flatten()
ym = popularity_measure2_df.values[1::3].flatten()
max_diff = popularity_measure2_df.values[2::3].flatten()
popularity_measure2_df = pd.DataFrame.from_dict({'asin': asins, 'Year-month': ym, 'month_diff': max_diff})
# The measure was computed in two parts; load and stitch them together
popularity_measure2_df = pd.read_csv("popularity_measure_2_part1.csv")
popularity_measure2_df = pd.concat([popularity_measure2_df, pd.read_csv("popularity_measure_2_part2.csv")])
popularity_measure2_df.reset_index(drop=True, inplace=True)
# Check for 0 valued 'asin' column entries that could happen in case of not being 3 months on the market (books having first review in the last 3 months)
unique_asin = set(ratings_df['asin'].values)
popularity_asins = set(popularity_measure2_df['asin'].values)
# Books present in the ratings but missing a popularity-2 score
t = unique_asin - popularity_asins
t = list(t)
# Drop the fill-value rows whose asin came back as the string '0'
popularity_measure2_df = popularity_measure2_df.drop(popularity_measure2_df[popularity_measure2_df['asin'] == '0'].index)
popularity_measure2_df.to_csv("popularity_measure_2.csv", index = False)
popularity_measure2_df = pd.read_csv("popularity_measure_2.csv")
# Combine both popularity metrics into one table keyed by asin
popularity_measure_df = pd.merge(
popularity_measure1_df,
popularity_measure2_df,
how='left',
left_index=True,
right_on='asin')
popularity_measure_df.reset_index(inplace=True)
Statistics for each book:
# Per-book statistics table: star counts + weighted stats + both popularity metrics.
# FIX: the original assigned to 'asincomplete_books_df' (a stray "asin" pasted
# in front of the name) while every following line uses complete_books_df,
# which raised a NameError.
complete_books_df = pd.merge(ratings_grouped_unstack_df, popularity_measure_df, how='left', left_index=True, right_on='asin')
complete_books_df.drop('index', axis = 1, inplace=True)
complete_books_df.reset_index(inplace=True, drop=True)
complete_books_df = complete_books_df.rename(
    index=str,
    columns={
        "popularity_factor": "pop_metric_1",
        "month_diff": "pop_metric_2",
        "Weighted_mean": "mean",
        "Year-month": "pop_metric_2_date"
    })
# Store the peak window as a monthly period
complete_books_df['pop_metric_2_date'] = pd.to_datetime(
    complete_books_df['pop_metric_2_date']).dt.to_period("M")
# 'Number_of_reviews' duplicates 'number_of_reviews' from the popularity table
complete_books_df.drop("Number_of_reviews", axis = 1, inplace=True)
complete_books_df
# Save statistics data to file
complete_books_df.to_csv("books_statistics_df.csv", index=False)
# Read statistics data from file
complete_books_df = pd.read_csv(
"books_statistics_df.csv",
parse_dates=['first_review_date', 'pop_metric_2_date'])
# Re-create the monthly period dtype lost in the csv round-trip
complete_books_df['pop_metric_2_date'] = pd.to_datetime(
complete_books_df['pop_metric_2_date']).dt.to_period("M")
complete_books_df.plot(x = 'mean', y = 'pop_metric_1', kind = 'scatter', alpha = 0.3, figsize=(20,7))
# Exclude books on the market for less than a month (rate is unstable there)
complete_books_df[complete_books_df['days_since_first_review'] > 30].plot(
x='mean', y='pop_metric_1', kind='scatter', alpha=0.3, figsize=(20, 7))
complete_books_df.plot(
x='mean', y='pop_metric_2', kind='scatter', alpha=0.3, figsize=(20, 10))
# Book -> category lists prepared separately (one JSON object per line)
books_with_categories = pd.read_json("product_categories_enhanced.json", lines = True)
books_with_categories = books_with_categories.set_index('asin', drop = True)
books_with_categories.head()
books_with_categories = pd.merge(
books_with_categories,
complete_books_df,
left_index=True,
right_on='asin',
how='inner')
books_with_categories.sort_values('number_of_reviews', ascending=False, inplace=True)
books_with_categories
# NOTE(review): 'Number_of_reviews' was dropped from complete_books_df above;
# verify this column actually exists here (it may come from the categories file)
sum(books_with_categories['Number_of_reviews'].values)
# Unzipping list of lists of categories, extracting all categories to which book is assigned
all_categories = books_with_categories['categories'].apply(lambda x:
list(set([item for lists in x for item in lists])))
# Frequency of every category across all books
all_categories_frequency = dict(Counter(all_categories.sum()))
len(all_categories_frequency)
# Most popular categories
sorted_all_categories_frequency = sorted(all_categories_frequency.items(), key=operator.itemgetter(1), reverse=True)
sorted_all_categories_frequency[:100]
# Cluster books on (review count, weighted mean rating, rating std-dev, average
# review length) and assess cluster quality via silhouette analysis.
# FIX: the notebook export flattened the two nested loop bodies; indentation
# restored here (structure follows the scikit-learn silhouette example).
X = ratings_grouped_unstack_df[['Number_of_reviews', 'Weighted_mean', 'Std_dev', 'Avg_len']].values
range_n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]
for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, 15000 + (n_clusters + 1) * 10])
    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = MiniBatchKMeans(init='random', n_clusters=n_clusters, batch_size=40000,
                                n_init=10, max_no_improvement=10, verbose=0, random_state=10)
    cluster_labels = clusterer.fit_predict(X)
    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels, sample_size=10000)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    # Compute the silhouette scores on a 15000-row subsample for speed
    X_sample, cluster_labels_sample = resample(X, cluster_labels, replace = False, n_samples = 15000)
    sample_silhouette_values = silhouette_samples(X_sample, cluster_labels_sample)
    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels_sample == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        print(size_cluster_i)
        y_upper = y_lower + size_cluster_i
        color = cm.spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)
        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    # 2nd Plot showing the actual clusters formed
    # NOTE(review): colors are derived from the 15000-row subsample but plotted
    # against the full X -- array lengths may disagree; confirm intent
    colors = cm.spectral(cluster_labels_sample.astype(float) / n_clusters)
    ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')
    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')
    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")
    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')
    plt.show()
# Initialization of VADER model
sid = SentimentIntensityAnalyzer()
# Create Spark context if it is not running
sc = SparkContext()
sqlContext = SQLContext(sc)
# Loading file with reviews
# NOTE(review): hard-coded path duplicates DATASET_DIR for user 'adam';
# consider reusing DATASET_DIR so the cell works on other machines
text_file = sc.textFile("/media/adam/B236CB1D36CAE209/Studia/ADA/reviews_Books_5.json")
df = sqlContext.read.json(text_file)
# Data transformation
# Same castings as earlier: unixReviewTime -> timestamp, reviewTime -> date
df = df.withColumn('unixReviewTime', from_unixtime(df['unixReviewTime']))
df = df.withColumn('reviewTime', to_date(df['unixReviewTime']))
df = df.withColumn('unixReviewTime', df['unixReviewTime'].cast('timestamp'))
# Function to apply on each row of the data - compute the sentiment score for each review
# FIX: the notebook export stripped the function-body indentation; restored.
def sentiment_f(row):
    """Score one review row with VADER (module-level `sid` analyzer).

    Returns a flat tuple: identifiers plus compound/neg/neu/pos scores for
    the full review text and, separately, for its summary.
    """
    rew = sid.polarity_scores(row.reviewText)
    summ = sid.polarity_scores(row.summary)
    return (row.asin, row.unixReviewTime, row.reviewerID, row.overall,
            rew['compound'], rew['neg'], rew['neu'], rew['pos'],
            summ['compound'], summ['neg'], summ['neu'], summ['pos'])
# Transform and collect the data about sentiment
sentiment = df.select("asin", "reviewText", "summary", "unixReviewTime",
'overall', "reviewerID").rdd.map(sentiment_f).collect()
sentiment_df = pd.DataFrame(sentiment, columns=['asin', 'unixReviewTime', 'reviewerID', 'overall', 'rew_compound', 'rew_neg', 'rew_neu', 'rew_pos',
'summ_compound', 'summ_neg', 'summ_neu', 'summ_pos'])
sentiment_df.head()
# Save collected data
sentiment_df.to_csv("sentiment.csv", index=False)
# Stop Spark context
sc.stop()
By the previous look at the data, we drew the conclusion that we would focus on the negative and positive ratings. We will therefore, for now, focus on the file giving the number of 1,2,3,4 and 5 stars ratings for each book.
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#create a dataframe with the ratings for each books and the length and total number or reviews
reviews_df_raw=pd.DataFrame.from_csv('ratings_grouped_unstack.csv')
d1=pd.DataFrame.from_csv('avg_length_and_number_of_reviews_per_book.csv')
reviews_df_raw.head()
In this dataset, asin is the Amazon identifier for the book, and the column named 1 (respectively 2, 3, 4 and 5) corresponds to how many reviews were associated with a rating of 1 star (respectively 2, 3, 4 and 5 stars) (every time someone reviews a book they need to rate the book from 1 star to 5).
We can start by plotting the total number of ratings for each number of stars.
# Total number of reviews per star level (column sums over all books)
x = reviews_df_raw.columns
y = reviews_df_raw.sum()
plt.plot(x,y)
plt.title('Total number of 1 to 5 stars ratings reviews')
plt.ylabel('Number of reviews')
plt.xlabel('Number of stars associated to the review')
plt.xticks([1,2,3,4,5])
Let's try to plot the logarithm of y for this plot to see if we can find something interesting.
# A log scale reveals whether the counts grow geometrically with star level
y2=np.log(y)
g=plt.plot(x,y2)
plt.title('Logarithm of the total number of 1 to 5 stars ratings reviews')
plt.ylabel('log(number of reviews)')
plt.xlabel('Number of stars associated to the review')
plt.xticks([1,2,3,4,5])
From 2 to 5 stars the logarithm of y gives a straight line! This means that people give exponentially more 5-star than 2-star ratings. This is a very interesting observation that will need to be questioned further, for example by checking whether anything in Amazon's review rules could explain it.
# merge the two datasets and calculate the "bad ratio" and "good_ratio" for each book
# bad_ratio and good_ratio are defined as the number of 1,2 stars (resp 4,5 stars)
# divided by the total number of reviews
# Move the index (asin / Book_id) into a column so the merge can join on 'asin'
d1['asin']=d1.index
reviews_df_raw['asin']=reviews_df_raw.index
reviews_df_tot=pd.merge(reviews_df_raw, d1, on=['asin'])
reviews_df_tot['bad_ratio']= (reviews_df_tot['1']+reviews_df_tot['2'])/(reviews_df_tot['1']+reviews_df_tot['2']+reviews_df_tot['3']+reviews_df_tot['4']+reviews_df_tot['5'])
reviews_df_tot['good_ratio']= (reviews_df_tot['4']+reviews_df_tot['5'])/(reviews_df_tot['1']+reviews_df_tot['2']+reviews_df_tot['3']+reviews_df_tot['4']+reviews_df_tot['5'])
reviews_df_tot.head()
For our analysis to make sense, we need to take books that have a significant number of reviews: if a book has only one or two reviews, and they happen to be negative/positive, the book will immediately be classified as a bad/good book, which is not reliable. However, if we set the minimum number of reviews too high, we will have very few "bad books" since, as we will see, the bulk of bad books have very few comments.
We will try with a limit of 10 reviews.
# Keep only books with a meaningful number of reviews: with one or two
# reviews a single rating decides the whole category, which is unreliable.
reviews_df = reviews_df_tot.copy()[reviews_df_tot.number_of_reviews > 10]
# First-guess category definitions:
#   bad book:       bad_ratio  > 0.7  (computed both with and without the
#                                      10-review floor, to see its impact)
#   good book:      good_ratio > 0.7
#   controversial:  |good_ratio - bad_ratio| < 0.1
# Good and controversial books are numerous enough that we only consider
# the floored dataset for them.
bad_books_tot = reviews_df_tot[reviews_df_tot.bad_ratio > 0.7]
bad_books = reviews_df[reviews_df.bad_ratio > 0.7]
good_books = reviews_df[reviews_df.good_ratio > 0.7]
controversial_books = reviews_df[(reviews_df.good_ratio - reviews_df.bad_ratio).abs() < 0.1]
bad_books_tot.describe()
bad_books.describe()
As we said before, the number of 'bad books' depends a lot on the minimum number of reviews we require in our dataset. This might be because once a book gets some bad reviews, its popularity drops and people stop reviewing it very quickly.
# Summary statistics for the good and controversial categories
# (the bad-book statistics were displayed in the previous cells).
good_books.describe()
controversial_books.describe()
We can already notice that there are far fewer "bad books" according to our definition, and their number is very dependent on the minimum number of reviews we impose on our dataset. This needs to be analysed further.
As there are many outliers in our dataset, we will plot only robust variables. The first function defined below plots boxplots with outliers and the second one without the outliers.
def plot_categories(column, *, show_outliers=True):
    """Draw a boxplot of `column` for each of the three book categories.

    Plots three stacked boxplots (bad, good and controversial books) of the
    given column. Relies on the module-level DataFrames `bad_books`,
    `good_books` and `controversial_books` defined above.

    Parameters
    ----------
    column : str
        Name of a column present in all three category DataFrames.
    show_outliers : bool, keyword-only, default True
        Forwarded to seaborn's `showfliers`; pass False to hide outlier
        points so the quartile/whisker summaries are easier to compare.
        (The default reproduces the original behavior exactly.)
    """
    # Create a figure with 3 stacked subplots, one per category.
    fig, ax = plt.subplots(figsize=(15, 12), ncols=1, nrows=3)
    plt.suptitle("Boxplots of " + column + " for three categories of books", fontsize=18)
    # Extra vertical space so subplot titles do not overlap the plot above.
    plt.subplots_adjust(hspace=0.5)
    # Set a title on each graph.
    ax[0].set_title("Bad books", fontsize=15)
    ax[1].set_title("Good books", fontsize=15)
    ax[2].set_title("Controversial Books", fontsize=15)
    # Plot the data, one category per axis.
    sns.boxplot(bad_books[column], ax=ax[0], showfliers=show_outliers)
    sns.boxplot(good_books[column], ax=ax[1], showfliers=show_outliers)
    sns.boxplot(controversial_books[column], ax=ax[2], showfliers=show_outliers)
    plt.show()
def plot_categories_no_outliers(column):
    """Boxplots of `column` per book category, with outlier points hidden.

    Same layout as `plot_categories`, but every boxplot is drawn with
    showfliers=False so only the robust quartile/whisker summary appears.
    Uses the module-level DataFrames `bad_books`, `good_books` and
    `controversial_books` defined above.
    """
    # One stacked axis per category, with room between them for the titles.
    fig, axes = plt.subplots(figsize=(15, 12), ncols=1, nrows=3)
    plt.suptitle("Boxplots of " + column + " for three categories of books", fontsize=18)
    plt.subplots_adjust(hspace=0.5)
    # Pair each axis with its category title and data, then draw in one pass.
    categories = [
        ("Bad books", bad_books),
        ("Good books", good_books),
        ("Controversial Books", controversial_books),
    ]
    for axis, (title, frame) in zip(axes, categories):
        axis.set_title(title, fontsize=15)
        sns.boxplot(frame[column], ax=axis, showfliers=False)
    plt.show()
# Compare average review length across the three categories,
# first with outliers shown, then with them hidden.
plot_categories('Avg_len')
plot_categories_no_outliers('Avg_len')
The average length of the reviews is fairly well balanced across the groups. However, there are far more outliers (longer reviews) among the "good books" and "controversial books" than among the "bad books". The median for controversial books also seems to be a bit higher than for the two other categories. This could be explained by the fact that when people disagree with each other they are more willing to write longer comments to explain their point of view. However, we see that we need more reliable statistics than the bad_ratio and good_ratio we made up earlier to be able to draw conclusions.
We can calculate the mean and the standard deviation of ratings for each book. We can use these non-robust statistics here because the ratings are bounded between 1 and 5, so there are no outliers.
# Start again from a fresh copy of the merged table, dropping the ad-hoc
# ratio columns from the previous analysis.
reviews_df_2 = reviews_df_tot.copy()
del reviews_df_2['bad_ratio']
del reviews_df_2['good_ratio']
reviews_df_2.head()
# Weighted mean rating per book: sum over s of s * (count of s-star reviews),
# divided by the total number of reviews.
weighted_sum = sum(reviews_df_2[str(s)] * s for s in range(1, 6))
reviews_df_2['mean_rat'] = weighted_sum / reviews_df_2.number_of_reviews
reviews_df_2.head()
# Histogram of the per-book mean ratings, 49 equal-width bins over [1, 5].
fig, ax = plt.subplots(figsize=(15, 10), ncols=1, nrows=1)
bin_edges = np.linspace(1, 5, 50)
reviews_df_2.mean_rat.hist(bins=bin_edges, rwidth=0.9)
plt.xlabel('Mean rating')
plt.ylabel('Number of books')
plt.title('Distribution for the mean ratings of a book')
plt.xticks(range(1, 6))
We see that the distribution is completely left skewed and we will try to recentre it between 0 and 1 using this article http://rogeriofvieira.com/wp-content/uploads/2016/05/Data-Transformations-1.pdf.
From this article, we see that we can try two transformations:
# Two transformations of the weighted mean, as suggested by the article:
# log10(6 - mean) and 1 / (6 - mean), both aimed at undoing the left skew.
shifted = 6 - reviews_df_2['mean_rat'].values
reviews_df_2['Trans_weighted_mean'] = np.log10(shifted)
reviews_df_2.plot(y='Trans_weighted_mean', kind='hist', bins=50, logx=False)
reviews_df_2['Inverse_weighted_mean'] = 1 / shifted
reviews_df_2.plot(y='Inverse_weighted_mean', kind='hist', bins=50, logx=False)
We see that this doesn't give very smooth plots and this is due to the books that have just a few ratings and therefore will have values completely equal to 5 for example. As before, we will consider only books that have more than 10 reviews.
# Restrict to books with more than 10 reviews before re-plotting the
# distributions, so books with a handful of ratings no longer dominate.
# FIX: the boolean mask was previously built from reviews_df_tot while being
# applied to reviews_df_2 — it only worked because the two frames share an
# index. Build the mask from the frame actually being filtered.
reviews_df_sm = reviews_df_2[reviews_df_2.number_of_reviews > 10].copy()
reviews_df_sm.plot(y='mean_rat', kind='hist', bins=100, logx=False)
reviews_df_sm.plot(y='Trans_weighted_mean', kind='hist', bins=50, logx=False)
reviews_df_sm.plot(y='Inverse_weighted_mean', kind='hist', bins=50, logx=False)
Now the graphs look way better. We can then refine our definitions for "good", "bad" and also add "neutral" books as following:
We will also analyse the standard deviation to define what is a "controversial" book.
Once those definitions are set, we can check whether they match the output of a sentiment analysis model like VADER: VADER can predict whether a sentence is negative or positive, so we could score each review and compare that to the number of stars it is associated with.
# Spark context shutdown
# NOTE(review): sc was already stopped earlier (right after sentiment.csv was
# written); SparkContext.stop() is safe to call again, but this is redundant
# unless the context was re-created in between — confirm cell execution order.
sc.stop()
Our internal milestones up until the report are:
Decide all models of grouping data into categories (negative/positive, fast/slow plateauing of review numbers, etc.)
Decide parameters and thresholds for category definitions based on the data
Enrich data by scraping Amazon for genres as extensively as possible
Create necessary intermediate files containing formatted data, to speed up analysis and computing
Find the best way to use VADER to analyze positivity and negativity of reviews
Calculate correlations between popularity metrics and positive and negative reviews and ratings
If necessary, run analysis on the cluster
Find the exact data story we want to tell (choose exact questions), make it compelling, and link it to social good
Decide format of report
Depending on report format, distribute tasks to individual group members
Produce good, polished visualizations that tell the story
Construct report into a coherent document
Update readme according to progress
Create a plan for presentation and poster creation
The enrichment of the data will be done using the Amazon Scraper.ipynb notebook, which contains a description of its function (please have a look).
In order to use VADER, we will have to decide how to apply its sentence-based logic to entire reviews. The obvious way is to analyze each sentence and take the mean score, but we will look into whether this is the most sensible way or if another way would be more relevant.
In the future, these and many more notebook cells will be filled:
# Retrieve sentiments of reviews:
# TODO...
# Calculate correlations:
# TODO...
# Produce final visualizations:
# TODO...